package com.lheia.util

import com.lheia.jcb.spider.ApprovalPublicity
import com.lheia.jcb.spider.IpeSpiderData
import com.lheia.jcb.spider.PublicSentiment
import com.lheia.tool.ToolDateTime
import com.lheia.tool.ToolString
import org.jsoup.Jsoup

/**
 * Scratch harness for trying out jsoup selectors and regex extraction
 * against live web pages. Only the last experiment in {@code main} is active;
 * earlier ones are kept commented out for reference.
 */
class test {
    /**
     * Fetches a single article page from www.qz.gov.cn over HTTP and prints
     * several fragments of it to standard output: the text of one table cell,
     * three regex extractions from a second cell, the first link target in
     * that second cell, and the text of one further cell.
     *
     * @param args command-line arguments (not used)
     */
    static main(def args){
        // Disabled experiment: fill an ApprovalPublicity from a page on www.hp.gov.cn.
//        def doc = Jsoup.connect("http://www.hp.gov.cn/hp22/0302/201703/6c1006f4f6d444b3b43c0992e7636507.shtml").get()
//        def approvalPublicity = new ApprovalPublicity()
//        approvalPublicity.regionName  = "广东省"
//        approvalPublicity.regionCode = "广州市"
//        approvalPublicity.county = "黄埔区"
//        approvalPublicity.approveDept = "黄浦区环保局"
//        def htmlHead = doc.select("div.content_xxgkAttr").select("li")
//        approvalPublicity.pubDate = ToolDateTime.extractDate(htmlHead.get(3).toString())
//        approvalPublicity.title = doc.select("h1.content_title")
//        approvalPublicity.spiderFileUrl = doc.baseUri()
//        def trList = doc.select("table").select("tr:gt(1)")
//        for(def trNode:trList ){
//            def tdList = trNode.select("td")
//            approvalPublicity.projectName = tdList.get(0).text()
//            approvalPublicity.approveDate = ToolDateTime.extractDate(tdList.get(1).text())
//            println approvalPublicity
//        }
//
//        println ToolString.getTextFromHtml(doc.select("table").select("tr").toString())

        // Disabled experiment: penalty listing via the search endpoint on www.qz.gov.cn.
//        def document = Jsoup.connect("http://www.qz.gov.cn/module/xxgk/search.jsp?divid=div1529597&infotypeId=A0705&jdid=3084&area=113308000026182931&sortfield=&").post()
//        def tr = document.select("table.tb_main").select("tr:gt(0)")
//        def pageTotalNum = ToolString.matchList(document.select("table.tb_title").select(":containsOwn(记录)").text(),'\\d+').get(1)
//            println(pageTotalNum)

        // Active experiment: a single article page.
        def page = Jsoup.connect("http://www.qz.gov.cn/art/2019/3/20/art_1529723_31377173.html").get()

        // Both inner tables sit under the same outer cell, so share that part of the selector.
        def outerCell = "body > div.main > div > div > table:nth-child(4) > tbody > tr > td > "

        // Text of the cell inside the first inner table.
        def firstTableCell = page.select(outerCell + "table:nth-child(1) > tbody > tr > td")
        println firstTableCell.text()

        // The cell inside the third inner table; its text is run through
        // ToolString.getTextFromHtml before the regexes are applied.
        def thirdTableCell = page.select(outerCell + "table:nth-child(3) > tbody > tr > td")
        def detailText = ToolString.getTextFromHtml(thirdTableCell.text())

        // Lookbehind/lookahead pairs that cut out the value between two labels:
        // project name, construction site, construction unit (in that order).
        def labelPatterns = [
                "(?<=项目名称：)(.*)(?=项目建设地点|建设地点)",
                "(?<=项目建设地点：|建设地点：)(.*?)(?=建设单位)",
                "(?<=建设单位：)(.*?)(?=建设项目概况|环评机构)"
        ]
        for (String labelPattern in labelPatterns) {
            println ToolString.matchList(detailText, labelPattern)
        }
        // Disabled: date/time extraction.
//        println ToolString.matchList(all,"(?<=日期：|时间：)(.*?)(?=环评文件|原文链)")

        // href of the first anchor that carries one, inside the same third-table cell.
        def anchors = thirdTableCell.select("a[href]")
        println anchors.attr("href")

        // Fourth cell of the first row of a table nested one div deeper.
        def fourthCell = page.select("body > div.main > div > div > div > table > tbody > tr:nth-child(1) > td:nth-child(4)")
        println fourthCell.text()
    }
}
